{
  "cells": [
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "view-in-github",
        "colab_type": "text"
      },
      "source": [
        "<a href=\"https://colab.research.google.com/github/greengerong/awesome-llm/blob/main/colab/save_load_dataset.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "nQ_-hTh61foy"
      },
      "source": [
        "This notebook gathers the code samples from the video below, which is part of the [Hugging Face course](https://huggingface.co/course)."
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "9tY-NE2r1fpE",
        "outputId": "52558485-4c53-4dfc-c284-96daa31fe56c"
      },
      "outputs": [
        {
          "data": {
            "text/html": [
              "<iframe width=\"560\" height=\"315\" src=\"https://www.youtube.com/embed/blF9uxYcKHo?rel=0&amp;controls=0&amp;showinfo=0\" frameborder=\"0\" allowfullscreen></iframe>"
            ],
            "text/plain": [
              "<IPython.core.display.HTML object>"
            ]
          },
          "execution_count": null,
          "metadata": {},
          "output_type": "execute_result"
        }
      ],
      "source": [
        "#@title\n",
        "from IPython.display import HTML\n",
        "\n",
        "# Render the course video inline. The iframe markup is written as adjacent string\n",
        "# literals, which Python joins into a single string before calling HTML().\n",
        "HTML(\n",
        "    '<iframe width=\"560\" height=\"315\" '\n",
        "    'src=\"https://www.youtube.com/embed/blF9uxYcKHo?rel=0&amp;controls=0&amp;showinfo=0\" '\n",
        "    'frameborder=\"0\" allowfullscreen></iframe>'\n",
        ")"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "YaJ2ieoP1fpO"
      },
      "source": [
        "Install the Transformers and Datasets libraries to run this notebook."
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "ehQGufi61fpQ"
      },
      "outputs": [],
      "source": [
        "# `%pip` installs into the environment of the running kernel (a bare `! pip` may hit a\n",
        "# different interpreter). The extras spec is quoted so the shell cannot glob the brackets.\n",
        "%pip install datasets \"transformers[sentencepiece]\""
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "OEzMI51f1fpR"
      },
      "outputs": [],
      "source": [
        "from datasets import load_dataset\n",
        "\n",
        "# Load the `allocine` dataset; the result is kept as `raw_datasets` for the cells below.\n",
        "raw_datasets = load_dataset(\"allocine\")\n",
        "# Display the cache files that back the loaded splits.\n",
        "raw_datasets.cache_files"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "DU2PO-ru1fpS"
      },
      "outputs": [],
      "source": [
        "# Persist all splits under the local `my-arrow-datasets` directory.\n",
        "raw_datasets.save_to_disk(\"my-arrow-datasets\")"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "_78pu31Q1fpU"
      },
      "outputs": [],
      "source": [
        "from datasets import load_from_disk\n",
        "\n",
        "# Read back the `my-arrow-datasets` directory and display what was loaded.\n",
        "arrow_datasets_reloaded = load_from_disk(\"my-arrow-datasets\")\n",
        "arrow_datasets_reloaded"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "3FN_bKWx1fpU"
      },
      "outputs": [],
      "source": [
        "# Write one CSV file per split. `index` is a boolean option of pandas' `to_csv`, so pass\n",
        "# an explicit `False` (rather than `None`) to keep the row index out of the files.\n",
        "for split, dataset in raw_datasets.items():\n",
        "    dataset.to_csv(f\"my-dataset-{split}.csv\", index=False)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "bwq-6TIt1fpV"
      },
      "outputs": [],
      "source": [
        "# Map each split name to the CSV file exported for it.\n",
        "data_files = {\n",
        "    split_name: f\"my-dataset-{split_name}.csv\"\n",
        "    for split_name in (\"train\", \"validation\", \"test\")\n",
        "}\n",
        "\n",
        "# Reload the CSV files with the `csv` loader, one split per entry of `data_files`.\n",
        "csv_datasets_reloaded = load_dataset(\"csv\", data_files=data_files)\n",
        "csv_datasets_reloaded"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "f1OMRZrn1fpX"
      },
      "outputs": [],
      "source": [
        "# Export every split twice in a single pass: once as JSON Lines, once as Parquet.\n",
        "for split, dataset in raw_datasets.items():\n",
        "    dataset.to_json(f\"my-dataset-{split}.jsonl\")\n",
        "    dataset.to_parquet(f\"my-dataset-{split}.parquet\")"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "OiIoXkM-1fpc"
      },
      "outputs": [],
      "source": [
        "# Split name -> JSON Lines file exported for that split.\n",
        "json_data_files = {\n",
        "    split_name: f\"my-dataset-{split_name}.jsonl\"\n",
        "    for split_name in (\"train\", \"validation\", \"test\")\n",
        "}\n",
        "\n",
        "# Split name -> Parquet file exported for that split.\n",
        "parquet_data_files = {\n",
        "    split_name: f\"my-dataset-{split_name}.parquet\"\n",
        "    for split_name in (\"train\", \"validation\", \"test\")\n",
        "}\n",
        "\n",
        "# Each format is read back with the loader script of the same name.\n",
        "json_datasets_reloaded = load_dataset(\"json\", data_files=json_data_files)\n",
        "parquet_datasets_reloaded = load_dataset(\"parquet\", data_files=parquet_data_files)"
      ]
    }
  ],
  "metadata": {
    "colab": {
      "name": "Saving and reloading a dataset",
      "provenance": [],
      "include_colab_link": true
    },
    "language_info": {
      "name": "python"
    },
    "kernelspec": {
      "name": "python3",
      "display_name": "Python 3"
    }
  },
  "nbformat": 4,
  "nbformat_minor": 0
}